{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 处理中文，并使用jieba分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\softwares\\python\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "d:\\softwares\\python\\lib\\site-packages\\umap\\distances.py:1063: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
      "  @numba.jit()\n",
      "d:\\softwares\\python\\lib\\site-packages\\umap\\distances.py:1071: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
      "  @numba.jit()\n",
      "d:\\softwares\\python\\lib\\site-packages\\umap\\distances.py:1086: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
      "  @numba.jit()\n",
      "d:\\softwares\\python\\lib\\site-packages\\umap\\umap_.py:660: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
      "  @numba.jit()\n"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "from hdbscan import HDBSCAN\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from bertopic import BERTopic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://maartengr.github.io/BERTopic/api/bertopic.html\n",
    "The default embedding model is all-MiniLM-L6-v2 when selecting language=\"english\" and paraphrase-multilingual-MiniLM-L12-v2 when selecting language=\"multilingual\"."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "条数:  20\n",
      "预览第一条:  今天 天气 晴朗 天空 湛蓝\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# step1 加载文件\n",
    "with open('./切词.txt', 'r', encoding='utf-8') as file:\n",
    "  docs = file.readlines()\n",
    "print('条数: ', len(docs))\n",
    "print('预览第一条: ', docs[0])\n",
    "\n",
    "vectorizer_model = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用能处理中文的词嵌入模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_model = SentenceTransformer(\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
    "\n",
    "hdbscan_model = HDBSCAN(min_cluster_size=2)\n",
    "\n",
    "vectorizer_model = CountVectorizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Topic</th>\n",
       "      <th>Count</th>\n",
       "      <th>Name</th>\n",
       "      <th>Representation</th>\n",
       "      <th>Representative_Docs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "      <td>0_天气_影响_全球_预测</td>\n",
       "      <td>[天气, 影响, 全球, 预测, 模式, 变化, 数据, 预报, 分析, 晴朗]</td>\n",
       "      <td>[天气 预报 依赖 复杂 算法 数据 分析\\n, 气候 变化 影响 全球 天气 模式\\n, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "      <td>1_nlp_帮助_自然语言处理_人类</td>\n",
       "      <td>[nlp, 帮助, 自然语言处理, 人类, 任务, 理解, 分析, 用于, 科学, 社交媒体]</td>\n",
       "      <td>[NLP 帮助 进行 机器 翻译 跨语言 交流 变得 容易\\n, NLP 用于 Siri A...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Topic  Count                Name  \\\n",
       "0      0     11       0_天气_影响_全球_预测   \n",
       "1      1      9  1_nlp_帮助_自然语言处理_人类   \n",
       "\n",
       "                                    Representation  \\\n",
       "0         [天气, 影响, 全球, 预测, 模式, 变化, 数据, 预报, 分析, 晴朗]   \n",
       "1  [nlp, 帮助, 自然语言处理, 人类, 任务, 理解, 分析, 用于, 科学, 社交媒体]   \n",
       "\n",
       "                                 Representative_Docs  \n",
       "0  [天气 预报 依赖 复杂 算法 数据 分析\\n, 气候 变化 影响 全球 天气 模式\\n, ...  \n",
       "1  [NLP 帮助 进行 机器 翻译 跨语言 交流 变得 容易\\n, NLP 用于 Siri A...  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 这里有两个问题\n",
    "# 其一是每次运行模型都会执行切词操作，非常浪费时间\n",
    "# 其二是介词会包含：标点符号、的、了等虚词，这些词更应该被排除在外\n",
    "topic_model = BERTopic(\n",
    "  embedding_model=embedding_model,\n",
    "  hdbscan_model=hdbscan_model,\n",
    "  vectorizer_model=vectorizer_model,\n",
    ")\n",
    "\n",
    "topic_model.fit_transform(docs) # 拟合模型\n",
    "topic_model.get_topic_info() # 获取主题聚类信息"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
